import pandas as pd
import seaborn as sns
import numpy as np
import requests
import plotly.express as px
import plotly.offline as po
import plotly.graph_objects as go
import sqlite3
from pathlib import Path
from sklearn.linear_model import LinearRegression
from scipy.stats import zscore
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# set views for notebook
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
Data was downloaded from [here](http://web.mta.info/developers/turnstile.html) and then put into a loal database file called "mta_data.db"
# location of db file
db_path = Path("/Users/randy.grant/training/metis/courses/projects/01_mta/database/")
# open a connection to the local mta_data.db file
con = sqlite3.connect(db_path / "mta_data.db")
# make dataframe from the database query
df = pd.read_sql("select * from mta_data", con)
# close connection
con.close()
df.shape
df.columns
df.dtypes
df.sample(5).style
# lowercase all
# ref: https://stackoverflow.com/a/50084009
df.rename(columns=str.lower, inplace=True)
df = df.applymap(lambda x: x.lower() if type(x) == str else x)
# make a timestamp column based on combined columns of DATE and TIME
df['timestamp'] = pd.to_datetime(df['date'] + df['time'], format='%m/%d/%Y%H:%M:%S')
# make the timestamp split out better
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month.map("{:02}".format)
df['day'] = df['timestamp'].dt.day.map("{:02}".format)
df['hour'] = df['timestamp'].dt.hour.map("{:02}".format)
df['minute'] = df['timestamp'].dt.minute.map("{:02}".format)
df['second'] = df['timestamp'].dt.second.map("{:02}".format)
# sort and remove duplicates
df.sort_values(["c/a", "unit", "scp", "station", "timestamp"], inplace=True, ascending=False)
df.drop_duplicates(subset=["c/a", "unit", "scp", "station", "timestamp"], inplace=True)
# drop unused columns
df = df.drop(["exits", "desc", "date", "time"], axis=1, errors="ignore")
# check the data to ensure changes
print(df.columns)
print(df.shape)
# check to see the data only has times from 1-1-2020 to 12-31-2021
print(df.timestamp.min())
print(df.timestamp.max())
# go get the top 5 according to the mta's website content
top5 = pd.read_html(requests.get("https://new.mta.info/agency/new-york-city-transit/subway-bus-ridership-2020").content)[1].head(5)
top5 = top5.applymap(lambda x: x.lower() if type(x) == str else x)
top5["Station/Complex"].to_list()
# check if all 5 are there in the df
df['station'][df['station'].isin(top5["Station/Complex"].to_list())].unique()
# 'grand central-42 st' isn't there so I need to find it
df['station'][df['station'].str.contains('42')].unique()
# it's named 'grd cntrl-42 st' so rename to that the value in the top5 df
top5['Station/Complex'].replace({'grand central-42 st': 'grd cntrl-42 st'}, inplace=True)
# now all 5 are there so make the df just those top 5
df = df[df['station'].isin(top5["Station/Complex"].to_list())]
# working on references from MTA 2 - 3 exercises
# find daily entries per turnstile
# "first" or "max" provides same in this case
ts_daily = df.groupby(["c/a", "unit", "scp", "station", "year", "month", "day"], as_index=False)['entries'].max()
# tried "rolling" instead of "shift"
ts_daily['daily_entries'] = ts_daily.groupby(["c/a", "unit", "scp", "station"], as_index=False).rolling(2).apply(lambda x: x.iloc[1] - x.iloc[0])["entries"]
# get rid of NaNs which indicate the earliest date's values
ts_daily.dropna(subset=["daily_entries"], axis=0, inplace=True)
ts_daily.head()
# since min time is 2019 and max is in 2022, remove those rows
ts_daily = ts_daily[~ts_daily.year.isin([2019, 2022])]
# determine the lower and upper bounds in iqr to deal with outliers
# ref: https://androidkt.com/detect-and-remove-outliers-from-pandas-dataframe/
q1=ts_daily['daily_entries'].quantile(0.25)
q3=ts_daily['daily_entries'].quantile(0.75)
iqr=q3-q1
lower_bound=q1 - 1.5 * iqr
upper_bound=q3 + 1.5 * iqr
# only keep rows such that the lower_bound and upper_bound are between the 25th and 75th percentile
ts_daily = ts_daily[~((ts_daily['daily_entries'] < lower_bound) | (ts_daily['daily_entries'] > upper_bound))].sort_values(by='daily_entries', ascending=False)
rider_summary = pd.read_html(requests.get("https://new.mta.info/agency/new-york-city-transit/subway-bus-ridership-2020").content)[0]
rider_summary
# most of this function was the instructor's as per "mta-pair-solution.ipynb" - modified a bit
def get_daily_counts(row, max_counter):
counter = abs(row["daily_entries"])
if counter > max_counter:
while counter > max_counter:
counter = counter / 3
return counter
return counter
# setting the max_counter to the median ridership per day
max_counter = int(np.max(rider_summary[['Average Weekday', 'Average Weekend']].sum(axis=1)))
ts_daily["daily_entries"] = ts_daily.apply(get_daily_counts, axis=1, max_counter=max_counter)
ts_daily.head()
stations_daily = ts_daily.groupby(["station", "year", "month", "day"])[['daily_entries']].sum().reset_index()
stations_daily['date'] = pd.to_datetime(stations_daily[["year", "month", "day"]])
stations_daily.head()
# set size
plt.figure(figsize=(20,10))
# tell it where to get the data and where to put it
ax = sns.lineplot(data=stations_daily, x="date", y="daily_entries", hue="station")
# set the number format to not be in scientific notation
plt.ticklabel_format(style='plain', axis='y')
# temp fix for "fixedformatter should only be used together with fixedlocator" error
# ref: https://stackoverflow.com/a/63755285
ticks_loc = ax.get_yticks().tolist()
ax.yaxis.set_major_locator(mpl.ticker.FixedLocator(ticks_loc))
locs, labels = plt.xticks()
# rotate x axis labels
# ref: https://www.delftstack.com/howto/seaborn/rotate-tick-labels-seaborn/
plt.setp(labels, rotation=45)
# add commas to the y-axis numbers
# ref: https://stackoverflow.com/questions/25973581/how-do-i-format-axis-number-format-to-thousands-with-a-comma-in-matplotlib
ax.set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
# title and axis lables
ax.set_title("Top 5 Stations with its Daily Entries from 2020 to 2021", fontsize=20)
ax.set_xlabel("Year with Month", fontsize = 15)
ax.set_ylabel("Daily Entries", fontsize = 15);
plt.figure(figsize=(20,10))
ax = sns.scatterplot(data=stations_daily, x="date", y="daily_entries", hue="station")
plt.ticklabel_format(style='plain', axis='y')
plt.setp(labels, rotation=45)
ax.set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
ax.set_title("Top 5 Stations with its Daily Entries from 2020 to 2021", fontsize=20)
ax.set_xlabel("Year with Month", fontsize = 15)
ax.set_ylabel("Daily Entries", fontsize = 15);
plt.figure(figsize=(20,10))
ax = sns.barplot(data=stations_daily, x="station", y="daily_entries")
ticks_loc = ax.get_yticks().tolist()
ax.yaxis.set_major_locator(mpl.ticker.FixedLocator(ticks_loc))
locs, labels = plt.xticks()
ax.set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
ax.set_title("Top 5 Stations Rollup for the 25th to 75th Percentile (IQR Based)", fontsize=20)
ax.set_xlabel("Station Name", fontsize = 15)
ax.set_ylabel("Daily Entries", fontsize = 15);
x0 = stations_daily[stations_daily['station'] == 'times sq-42 st']
x1 = stations_daily[stations_daily['station'] == 'grd cntrl-42 st']
x2 = stations_daily[stations_daily['station'] == '34 st-herald sq']
x3 = stations_daily[stations_daily['station'] == '14 st-union sq']
x4 = stations_daily[stations_daily['station'] == 'fulton st']
fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharey=True)
fig.delaxes(axes[1,1])
plt.ticklabel_format(style='plain', axis='y')
fig.suptitle('IQR of Daily Entries of the Top 5 Stations', fontsize=20)
sns.barplot(ax=axes[0,0], data=x0, x="station", y="daily_entries")
axes[0,0].set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
sns.barplot(ax=axes[0,1], data=x1, x="station", y="daily_entries")
axes[0,1].set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
sns.barplot(ax=axes[0,2], data=x2, x="station", y="daily_entries")
axes[0,2].set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
sns.barplot(ax=axes[1,0], data=x3, x="station", y="daily_entries")
axes[1,0].set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
sns.barplot(ax=axes[1,2], data=x4, x="station", y="daily_entries")
axes[1,2].set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
# ref: https://plotly.com/python/time-series/#summarizing-timeseries-data-with-histograms
# make plotly be able to be exported correctly from a notebook
po.init_notebook_mode()
# begin plotly fun
fig = px.histogram(stations_daily, x="date", y="daily_entries", histfunc="sum", title="Total Daily Entries of the Top 5 Stations - Binned Monthly")
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=False, ticklabelmode="instant", dtick="M3", tickformat="%m-%Y")
fig.update_layout(bargap=0.3, width=1500, height=500, paper_bgcolor="lightcyan")
fig.show()
# ref: https://plotly.com/python/histograms/
x0 = stations_daily['daily_entries'][stations_daily['station'] == 'times sq-42 st']
x1 = stations_daily['daily_entries'][stations_daily['station'] == 'grd cntrl-42 st']
x2 = stations_daily['daily_entries'][stations_daily['station'] == '34 st-herald sq']
x3 = stations_daily['daily_entries'][stations_daily['station'] == '14 st-union sq']
x4 = stations_daily['daily_entries'][stations_daily['station'] == 'fulton st']
fig = go.Figure(go.Histogram(
x=x0,
name="times sq-42 st",
bingroup=1))
fig.add_trace(go.Histogram(
x=x1,
name="grd cntrl-42 st",
bingroup=1))
fig.add_trace(go.Histogram(
x=x2,
name="34 st-herald sq",
bingroup=1))
fig.add_trace(go.Histogram(
x=x3,
name="14 st-union sq",
bingroup=1))
fig.add_trace(go.Histogram(
x=x4,
name="fulton st",
bingroup=1))
fig.update_layout(
barmode="overlay",
bargap=0.3,
width=1500,
height=500,
paper_bgcolor="lightcyan",
title="Spread of Data Across the Top 5 Stations",
xaxis_title="Daily Entry Bins",
yaxis_title="Count of Daily Entries in Bin",
legend_title="Stations",
yaxis_type="linear")
fig.show()
# ref: https://stackoverflow.com/a/69177333
# make the date the index
stations_daily.set_index('date', inplace=True)
# add an ordinal column because sklearn doesn't work with datetimes
stations_daily['ordinal'] = stations_daily.index.map(pd.Timestamp.toordinal)
# create the model
model = LinearRegression()
# extract x and y from dataframe data
x = stations_daily[['ordinal']]
y = stations_daily[['daily_entries']]
# fit the mode
model.fit(x, y)
# print the slope and intercept if desired
print('intercept:', model.intercept_[0])
print('slope:', model.coef_[0][0])
# select x1 and x2 and get the corresponding date from the index
x1 = stations_daily.ordinal.min()
x1_date = stations_daily[stations_daily.ordinal.eq(x1)].index[0]
x2 = stations_daily.ordinal.max()
x2_date = stations_daily[stations_daily.ordinal.eq(x2)].index[0]
# calculate y1, given x1
y1 = model.predict(np.array([[x1]]))[0][0]
print('y1:', y1)
# calculate y2, given x2
y2 = model.predict(np.array([[x2]]))[0][0]
print('y2:', y2)
# plot it
ax1 = stations_daily.plot(y='daily_entries', c='black', figsize=(20, 10), grid=True, legend=False, label="Volume of Daily Entries", title="Volume of Top 5 Stations with Linear Trend Line")
locs, labels = plt.xticks()
plt.ticklabel_format(style='plain', axis='y')
plt.setp(labels, rotation=45)
ax1.set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
ax1.plot([x1_date, x2_date], [y1, y2], label='Trend Line', c='red')
ax1.legend(fontsize=15)
ax1.set_xlabel("Year and Month", fontsize = 15)
ax1.set_ylabel("Daily Entries", fontsize = 15);
# reset the index
stations_daily.reset_index(inplace=True)